# NOTE(review): this silences ALL warnings for the whole session; prefer
# suppressWarnings() around the specific calls that warn, or restore with
# options(warn = 0) when done.
options(warn=-1)
library(astsa) #provides time series model functions (arima, sarima)
library(TSstudio) #plotly library for plotting time series data
library(xts) #library to convert data frame to time series object
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the daily female-birth counts for California, 1959. The CSV has two
# columns: date (character, "YYYY-MM-DD") and births (integer count).
# Style fix: use `<-` for assignment, not `=`.
birth.data <- read.csv("daily-total-female-births-CA.csv")
head(birth.data)
## date births
## 1 1959-01-01 35
## 2 1959-01-02 32
## 3 1959-01-03 30
## 4 1959-01-04 31
## 5 1959-01-05 44
## 6 1959-01-06 29
str(birth.data)
## 'data.frame': 365 obs. of 2 variables:
## $ date : chr "1959-01-01" "1959-01-02" "1959-01-03" "1959-01-04" ...
## $ births: int 35 32 30 31 44 29 45 43 38 27 ...
# Parse the character dates into Date objects so the series can be indexed
# chronologically when converted to a time-series object below.
birth.data$date <- as.Date(birth.data$date)
head(birth.data)
## date births
## 1 1959-01-01 35
## 2 1959-01-02 32
## 3 1959-01-03 30
## 4 1959-01-04 31
## 5 1959-01-05 44
## 6 1959-01-06 29
# Keep the raw counts as a plain numeric vector for acf/pacf later on.
number_of_births <- birth.data$births
# Build an xts time-series object: values are the birth counts, the index is
# the parsed Date column. Referring to columns by name (rather than by
# position) makes the intent explicit.
birth.ts <- xts(birth.data$births, order.by = birth.data$date)
# Label the single data column.
colnames(birth.ts) <- "births"
head(birth.ts)
## births
## 1959-01-01 35
## 1959-01-02 32
## 1959-01-03 30
## 1959-01-04 31
## 1959-01-05 44
## 1959-01-06 29
# Interactive plot of the daily series (plotly-based), with a range slider
# for zooming into sub-periods.
ts_plot(birth.ts, title = "Daily total female births in California in 1959",
Xtitle = "Month",
Ytitle = "Number of female births",
slider=TRUE)
# The plot suggests some trend in the time series.
# Null hypothesis (H0): there is no autocorrelation between the lags.
# Alternative hypothesis (H1): there is significant autocorrelation between the lags.
# Box-Pierce test for autocorrelation up to lag = log(n), a common heuristic
# for choosing the number of lags.
# NOTE(review): log(365) ~ 5.9 is not an integer; Box.test() does not round
# it, which is why the output reports fractional df. Consider
# floor(log(length(birth.ts))) — TODO confirm intended lag choice.
Box.test(x=birth.ts,lag=log(length(birth.ts)))
##
## Box-Pierce test
##
## data: birth.ts
## X-squared = 36.391, df = 5.8999, p-value = 2.088e-06
# Since the p-value is below 0.05, we reject the null hypothesis and conclude
# that there is significant autocorrelation between the lags in the series.
# First-order differencing: value(t) = observation(t) - observation(t-1).
# Plot the first-differenced series; differencing removes the trend component.
ts_plot(diff(birth.ts),title='Differenced data')
# The trend is removed; outliers remain around Sept 2 and 3.
# Re-run the Box-Pierce test on the differenced data.
# Box-Pierce test on the differenced series, same lag heuristic as above.
Box.test(diff(birth.ts),lag=log(length(birth.ts)))
##
## Box-Pierce test
##
## data: diff(birth.ts)
## X-squared = 78.094, df = 5.8999, p-value = 7.661e-15
# The p-value is again well below 0.05, so significant autocorrelation
# remains after differencing. Differencing removed the trend but not the
# serial dependence, which motivates fitting an ARMA-type model next.
# Stack the ACF and PACF of the differenced series in one plotting device.
# Save the previous graphics parameters and restore them afterwards so the
# mfrow change does not leak into later plots.
old.par <- par(mfrow=c(2,1))
acf(diff(number_of_births))
pacf(diff(number_of_births))
par(old.par)
# The ACF and PACF plots guide the choice of candidate ARIMA orders.
# We fit 4 models with p in {0, 7}, d = 1 (the series was differenced once),
# and q in {1, 2}.
# Box-Pierce test on a fitted model's residuals (lag = log(n)); a large
# p-value indicates the residuals look like white noise.
resid_box_test <- function(fit) {
  Box.test(fit$residuals, lag = log(length(fit$residuals)))
}
# Fit the four candidate ARIMA(p,1,q) models and test each one's residuals.
model1 <- arima(birth.ts, order = c(0, 1, 1))
model1.test <- resid_box_test(model1)
model2 <- arima(birth.ts, order = c(7, 1, 1))
model2.test <- resid_box_test(model2)
model3 <- arima(birth.ts, order = c(0, 1, 2))
model3.test <- resid_box_test(model3)
model4 <- arima(birth.ts, order = c(7, 1, 2))
model4.test <- resid_box_test(model4)
# Collect AIC and residual-test p-value for each model into a single table
# for side-by-side comparison (lower AIC and higher p-value are better).
df <- data.frame(row.names = c("AIC", "p-value"),
                 c(model1$aic, model1.test$p.value),
                 c(model2$aic, model2.test$p.value),
                 c(model3$aic, model3.test$p.value),
                 c(model4$aic, model4.test$p.value))
colnames(df) <- c('Arima(0,1,1)', 'Arima(7,1,1)', 'Arima(0,1,2)', 'Arima(7,1,2)')
df
## Arima(0,1,1) Arima(7,1,1) Arima(0,1,2) Arima(7,1,2)
## AIC 2462.2207021 2464.8827225 2459.5705306 2466.6664136
## p-value 0.5333604 0.9999899 0.9859227 0.9999929
# Fit the best candidate, ARIMA(0,1,2), via astsa::sarima with no seasonal
# component (P = D = Q = 0). sarima() also produces residual diagnostic
# plots (standardized residuals, ACF of residuals, Q-Q plot, Ljung-Box).
sarima(birth.ts, 0,1,2,0,0,0)
## initial value 2.216721
## iter 2 value 2.047518
## iter 3 value 1.974780
## iter 4 value 1.966955
## iter 5 value 1.958906
## iter 6 value 1.952299
## iter 7 value 1.951439
## iter 8 value 1.950801
## iter 9 value 1.950797
## iter 10 value 1.950650
## iter 11 value 1.950646
## iter 12 value 1.950638
## iter 13 value 1.950635
## iter 13 value 1.950635
## iter 13 value 1.950635
## final value 1.950635
## converged
## initial value 1.950708
## iter 2 value 1.950564
## iter 3 value 1.950290
## iter 4 value 1.950196
## iter 5 value 1.950185
## iter 6 value 1.950185
## iter 7 value 1.950185
## iter 7 value 1.950185
## iter 7 value 1.950185
## final value 1.950185
## converged
## $fit
##
## Call:
## arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = S),
## xreg = constant, transform.pars = trans, fixed = fixed, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 ma2 constant
## -0.8511 -0.1113 0.015
## s.e. 0.0496 0.0502 0.015
##
## sigma^2 estimated as 49.08: log likelihood = -1226.36, aic = 2460.72
##
## $degrees_of_freedom
## [1] 361
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.8511 0.0496 -17.1448 0.0000
## ma2 -0.1113 0.0502 -2.2164 0.0273
## constant 0.0150 0.0150 1.0007 0.3176
##
## $AIC
## [1] 6.760225
##
## $AICc
## [1] 6.760408
##
## $BIC
## [1] 6.803051
# Looking at the p-values of the moving-average terms, there is no remaining
# autocorrelation between the lags. The AIC of the SARIMA fit is better than
# that of the plain ARIMA model.
# The normal Q-Q plot of the standardized residuals shows it fits the time
# series data almost perfectly.